import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
import operator
from math import log

def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties are broken in favour of the label seen first (dict insertion
    order), matching a stable descending sort on counts.
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    # max() returns the first item with the maximal count, i.e. the
    # first-seen label among ties — same result as sorting stably.
    best = max(counts.items(), key=operator.itemgetter(1))
    return best[0]

def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels of dataSet.

    The class label is assumed to be the LAST column of the DataFrame.
    Generalized from the original hard-coded {'R', 'M'} label dict (which
    raised KeyError for any other dataset): every distinct label value is
    counted, so arbitrary label sets work.

    Returns the entropy in bits as a float (0.0 for an empty frame).
    """
    numEntries = len(dataSet)
    if numEntries == 0:  # guard: an empty split contributes zero entropy
        return 0.0
    # One vectorized pass over the label column replaces the row-by-row
    # iterrows() loop; zero-count labels simply never appear, so the old
    # "prob == 0" special case is unnecessary.
    labelCounts = dataSet.iloc[:, -1].value_counts()
    shannonEnt = 0.0
    for count in labelCounts:
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # accumulate -p*log2(p)
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    """Binary-split dataSet on column `axis` at threshold `value`.

    Returns (geDataSet, ltDataSet): rows where dataSet[axis] >= value,
    and rows where it is < value, in original row order.

    Rewritten with boolean-mask selection: DataFrame.append() was removed
    in pandas 2.0, and the old row-by-row rebuild was quadratic and could
    mangle dtypes/column order.
    """
    mask = dataSet[axis] >= value
    # ~mask selects the strict "less than" complement.
    return dataSet[mask], dataSet[~mask]

def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column and threshold with maximal information gain.

    dataSet: DataFrame whose LAST column is the class label.
    Returns (bestFeature, bestValue): the positional index of the winning
    feature column and the threshold value used for the >=/< split.

    Bug fixed: the original iterated over ALL columns, including the class
    label itself — splitting on the label gives a spuriously perfect gain
    and later corrupts the tree. The label column is now excluded. Debug
    prints and the dead `count` counter were removed.
    """
    numFeatures = len(dataSet.columns) - 1  # exclude the label column
    baseEntropy = calcShannonEnt(dataSet)   # entropy before any split
    bestInfoGain = -1
    bestFeature = 0
    bestValue = 0
    for i in range(numFeatures):
        # Each distinct value of the column is tried as a >= threshold.
        uniqueVals = set(dataSet.iloc[:, i])
        for value in uniqueVals:
            subDataSet, subDataSet2 = splitDataSet(dataSet, dataSet.columns[i], value)
            if len(subDataSet) == 0 or len(subDataSet2) == 0:
                continue  # degenerate split: one side empty, no information
            prob = len(subDataSet) / float(len(dataSet))
            prob2 = len(subDataSet2) / float(len(dataSet))
            # Weighted entropy of the two halves after the split.
            newEntropy = prob * calcShannonEnt(subDataSet) + prob2 * calcShannonEnt(subDataSet2)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                bestValue = value
    return bestFeature, bestValue
def dropDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose column `axis` is strictly LESS
    than `value` (the original comment claimed "greater"; the code keeps
    the < side, which is what is documented here).

    Rewritten with boolean-mask selection: DataFrame.append() was removed
    in pandas 2.0 and the row-by-row rebuild was quadratic.
    """
    return dataSet[dataSet[axis] < value]
def TreeGenerate(dataSet, labels):
    """Recursively build a (single-branch) decision tree.

    dataSet: DataFrame whose LAST column is the class label.
    labels:  mutable list of column names, kept in sync with dataSet's
             columns; entries are deleted as features are consumed.

    Bugs fixed vs. the original:
      * `[example[-1] for example in dataSet]` iterates the DataFrame's
        COLUMN NAMES, so example[-1] was the last character of each column
        name — labels are now read from the last column's values.
      * The chosen feature column was dropped BEFORE splitting on its
        positional index, so the split ran on the wrong column; we now
        split first, by column name, then drop.
      * Guard against recursing on an empty subset.
    """
    classList = dataSet.iloc[:, -1].tolist()
    # All rows share one class: return that class as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only one feature column left (plus the label): majority-vote leaf.
    if dataSet.shape[1] == 2:
        return majorityCnt(classList)
    bestFeat, flag = chooseBestFeatureToSplit(dataSet)  # feature index + threshold
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}, 'value': flag}
    # Split on the chosen column while it still exists, then drop it;
    # only the "< threshold" half is recursed on (original design).
    _, dataSet = splitDataSet(dataSet, bestFeatLabel, flag)
    dataSet = dataSet.drop(columns=[bestFeatLabel])
    del labels[bestFeat]
    if len(dataSet) == 0:  # nothing left on the < branch: majority leaf
        myTree[bestFeatLabel] = majorityCnt(classList)
    else:
        myTree[bestFeatLabel] = TreeGenerate(dataSet, labels)
    return myTree



if __name__ == '__main__':
    # Driver: load the sonar dataset (last column is the 'R'/'M' class
    # label), build the tree on a small sample, and print it. Guarded so
    # importing this module does not trigger file I/O.
    dataSet = pd.read_csv('sonar.csv')
    dataSet = dataSet.head(5)  # small sample for a quick run
    # Widen pandas display so wide frames print on one line.
    pd.set_option("display.max_columns", 1000000)
    pd.set_option('display.width', 10000)
    labels = dataSet.columns.tolist()
    Tree = TreeGenerate(dataSet, labels)
    print(Tree)